import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import statsmodels
from statsmodels.tsa.seasonal import seasonal_decompose, STL
import statsmodels.tsa.api as smt
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.stattools import pacf, acf
from statsmodels.stats.stattools import durbin_watson
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from utils import *
%matplotlib inline
# Load the raw Vodafone OHLCV data and derive the calendar features used
# throughout the analysis below.
data = pd.read_csv('vodafone_data.csv')
data.head()

# Parse the date column once instead of re-parsing it for every feature.
_dates = pd.to_datetime(data['Date'])
data['Date'] = _dates.dt.date                    # plain datetime.date for grouping/display
data['Date_index'] = pd.DatetimeIndex(data['Date'])  # datetime64 column for resampling
data['year'] = _dates.dt.year
data['quarter'] = _dates.dt.quarter
data['month'] = _dates.dt.month

# Use a list, not a set: a set gives nondeterministic min/max row order.
data.Date.agg(['min', 'max'])
data.shape

# Observations per year, absolute and relative.
data.year.value_counts().plot(kind='bar', rot=45);
plt.title('Qty of observations for each year');
data.year.value_counts(normalize=True).plot(kind='bar', rot=45);
plt.title('Percentage of observations for each year');
# Distribution of the closing price, overall and per year.
# NOTE: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(kde=True, stat='density') is the documented replacement and
# produces the same density-scaled histogram with a KDE overlay.
sns.histplot(data.Close, kde=True, stat='density');
fig, axs = plt.subplots(nrows=data.year.nunique(), ncols=1, figsize=(9, 35))
for idx, year in enumerate(data.year.unique()):
    sns.histplot(data.loc[data.year == year, 'Close'], kde=True, stat='density', ax=axs[idx])
    axs[idx].set_title('Distplot for {}'.format(year))
# Build the full daily calendar spanning the observed data and find the
# dates with no observation (weekends, exchange holidays, gaps).
# Range is derived from the data itself rather than hard-coded, so the
# analysis stays correct if the CSV is refreshed.
full_range = pd.date_range(start=data.Date.min(), end=data.Date.max(), freq='D')
missing_dates = full_range.difference(pd.DatetimeIndex(data.Date))
# For the original 2016-02-11..2021-02-11 sample this was 568 dates.
missing_dates.nunique()
missing_dates

missing_dates = pd.DataFrame(missing_dates, columns=['date'])
missing_dates['year'] = missing_dates['date'].dt.year
# Missing days per year, absolute and relative.
missing_dates.groupby('year').date.count().plot(kind='bar', rot=45);
plt.title('Qty missing values for each year');
missing_dates.year.value_counts(normalize=True).plot(kind='bar', rot=45);
plt.title('Percentage of missing values for each year');

# Day-of-month / month breakdown (e.g. to spot the weekend pattern).
missing_dates['month'] = missing_dates['date'].dt.month
missing_dates['day'] = missing_dates['date'].dt.day
missing_dates.head()
missing_dates.day.value_counts(normalize=True).head()
missing_dates.month.value_counts(normalize=True).head()
data.head()

# Reindex to a complete daily calendar and linearly interpolate the price
# and volume columns so the series has no calendar gaps.
data_no_missing = data.copy(deep=True)
data_no_missing.head()
data_no_missing = data_no_missing.set_index('Date_index').asfreq('D')
data_no_missing.head()

value_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
data_no_missing[value_cols] = data_no_missing[value_cols].interpolate()
data_no_missing.head()

data_no_missing = data_no_missing.reset_index(drop=False)
data_no_missing.head(3)

# Rebuild the calendar features so the rows inserted by asfreq get values too.
_filled_dates = pd.to_datetime(data_no_missing['Date_index'])
data_no_missing['Date'] = _filled_dates.dt.date
data_no_missing['year'] = _filled_dates.dt.year
data_no_missing['quarter'] = _filled_dates.dt.quarter
data_no_missing['month'] = _filled_dates.dt.month
data_no_missing.head()
# Yearly box-plots and the raw daily closing-price series.
fig = plt.figure(figsize=(10, 8))
sns.boxplot(data=data_no_missing, x=data_no_missing.year, y='Close');
plt.title('Box-Plots for each year');
data_no_missing.set_index('Date').Close.plot(title='Closing price time series', figsize=(10, 7));

# One panel per year of the daily series.
fig, axs = plt.subplots(nrows=data.year.nunique(), ncols=1, figsize=(10, 50))
for idx, year in enumerate(data.year.unique()):
    data_no_missing[data_no_missing.year == year].set_index('Date').Close.plot(
        title='Closing price time series for {}'.format(year), ax=axs[idx], rot=45)

# Quarterly mean of the closing price, overall and per year.
# ("Quarterely" typo fixed in the titles.)
data_no_missing.set_index('Date_index').Close.resample('Q').mean().plot(
    title='Quarterly Closing price time series', rot=45);
fig, axs = plt.subplots(nrows=1, ncols=data.year.nunique(), figsize=(30, 10))
for idx, year in enumerate(data.year.unique()):
    data_no_missing[data_no_missing.year == year].set_index('Date_index').Close.resample('Q').mean().plot(
        title='Quarterly Closing price time series for {}'.format(year), ax=axs[idx], rot=45)

# Monthly mean of the closing price, overall and per year.
# BUG FIX: these resample with 'M' (monthly) but the original titles were
# copy-pasted from the quarterly cell and said "Quarterely".
data_no_missing.set_index('Date_index').Close.resample('M').mean().plot(
    title='Monthly Closing price time series', rot=45);
fig, axs = plt.subplots(nrows=1, ncols=data.year.nunique(), figsize=(30, 10))
for idx, year in enumerate(data.year.unique()):
    data_no_missing[data_no_missing.year == year].set_index('Date_index').Close.resample('M').mean().plot(
        title='Monthly Closing price time series for {}'.format(year), ax=axs[idx], rot=45)
# Stationarity checks on the raw closing price via utils.stationarity_test:
# ADF (H0: unit root) and KPSS (H0: stationary).
# NOTE(review): stationarity_test/plot_decomposition come from utils and are
# not visible here — assumed to print/plot their results.
stationarity_test(data_no_missing.Close, smt.adfuller)
stationarity_test(data_no_missing.Close, smt.kpss)
data_no_missing.head(1)
data_no_missing = data_no_missing.set_index('Date_index')


def _decompose_and_test(series, model, period):
    """Plot a seasonal decomposition and run ADF + KPSS on its residuals.

    Returns the decomposition result so the caller can keep inspecting it.
    """
    decomposition = plot_decomposition(series, model, period)
    resid = decomposition.resid.dropna()  # decomposition leaves NaNs at the edges
    stationarity_test(resid, smt.adfuller)
    stationarity_test(resid, smt.kpss)
    return decomposition


# Periods tried: 5 (trading week), 20 (~trading month), 253 (~trading year).
# Variable names kept as in the original (incl. the "sesonal" typo) so any
# later cells referring to them still work.
sesonal_decomposition_additive_5 = _decompose_and_test(data_no_missing.Close, 'additive', 5)
sesonal_decomposition_additive_20 = _decompose_and_test(data_no_missing.Close, 'additive', 20)
sesonal_decomposition_additive_253 = _decompose_and_test(data_no_missing.Close, 'additive', 253)
sesonal_decomposition_multiplicative_5 = _decompose_and_test(data_no_missing.Close, 'multiplicative', 5)
sesonal_decomposition_multiplicative_20 = _decompose_and_test(data_no_missing.Close, 'multiplicative', 20)
sesonal_decomposition_multiplicative_253 = _decompose_and_test(data_no_missing.Close, 'multiplicative', 253)
data_no_missing = data_no_missing.reset_index(drop=False)

# Time-series cross-validation (utils.make_cross_validation); returns
# per-iteration metric frames for the train and hold-out test windows.
all_metrics_df, all_metrics_df_test = make_cross_validation(data_no_missing, test_size=100)
all_metrics_df
all_metrics_df_test

# Train metrics across CV iterations.
fig, axs = plt.subplots(nrows=all_metrics_df.shape[1], ncols=1, figsize=(15, 30))
for idx, metric_title in enumerate(all_metrics_df.columns):
    axs[idx].plot(all_metrics_df[metric_title])
    axs[idx].set_title('Train {} distribution through iterations'.format(metric_title))

# Test metrics across CV iterations.
# BUG FIX: the original sized/iterated this figure on the *train* frame's
# columns while plotting the test frame, and used identical titles, making
# the two figures indistinguishable (and breaking if the frames' columns
# ever differ).
fig, axs = plt.subplots(nrows=all_metrics_df_test.shape[1], ncols=1, figsize=(15, 30))
for idx, metric_title in enumerate(all_metrics_df_test.columns):
    axs[idx].plot(all_metrics_df_test[metric_title])
    axs[idx].set_title('Test {} distribution through iterations'.format(metric_title))